In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Apply seaborn's "darkgrid" theme (grey background with grid lines) to every plot below.
sns.set_theme(style="darkgrid")
In [ ]:
# Load the per-turn metrics and keep only the rows recorded for the robot.
turns_raw = pd.read_csv("all_turns_2.csv")
df = (
    turns_raw.loc[turns_raw['person_robot'] == 'robot']
    .drop(columns=['Unnamed: 0'])  # leftover index column from the CSV export
    .copy()  # own the data so the column assignments below never hit a view/slice
)
# Turn length in samples scaled by 0.2.
# NOTE(review): presumably 0.2 is the sampling period in seconds — confirm.
df['turn_duration'] = 0.2 * (df['end_idx'].astype('float') - df['start_idx'].astype('float'))
df.describe().T
Out[ ]:
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| participant_id | 107.0 | 1851.644860 | 608.431703 | 407.000000 | 2102.000000 | 2105.000000 | 2107.000000 | 2111.000000 |
| path_num | 107.0 | 2.261682 | 0.743988 | 1.000000 | 2.000000 | 2.000000 | 3.000000 | 3.000000 |
| turn_num | 107.0 | 2.644860 | 1.864527 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 10.000000 |
| start_idx | 107.0 | 527.757009 | 425.483695 | 57.000000 | 238.000000 | 420.000000 | 660.000000 | 2164.000000 |
| end_idx | 107.0 | 586.364486 | 426.185857 | 83.000000 | 297.500000 | 489.000000 | 708.000000 | 2264.000000 |
| walking_direction_lag | 107.0 | -4.775701 | 23.691535 | -79.000000 | -15.500000 | 0.000000 | 7.000000 | 64.000000 |
| walking_direction_base_corr | 107.0 | 0.065443 | 0.398870 | -0.706492 | -0.308203 | 0.082524 | 0.401292 | 0.866582 |
| walking_direction_lagged_corr | 107.0 | 0.472802 | 0.152860 | 0.188645 | 0.367763 | 0.447397 | 0.563752 | 0.955078 |
| walking_direction_dtw | 107.0 | 49.845938 | 28.586508 | 4.432664 | 28.554681 | 45.706446 | 60.959196 | 162.955230 |
| speeds_lag | 107.0 | -3.093458 | 17.179701 | -69.000000 | -12.000000 | -1.000000 | 4.500000 | 44.000000 |
| speeds_base_corr | 107.0 | 0.144590 | 0.318698 | -0.663930 | -0.092252 | 0.192048 | 0.391395 | 0.839497 |
| speeds_lagged_corr | 107.0 | 0.466142 | 0.135817 | 0.194506 | 0.367961 | 0.460162 | 0.547553 | 0.839497 |
| speeds_dtw | 107.0 | 40.684982 | 19.690040 | 10.222585 | 27.259617 | 35.758098 | 50.797445 | 141.492438 |
| mean_distance | 107.0 | 2.407653 | 1.329329 | 0.336612 | 1.462095 | 2.205431 | 3.052211 | 9.579321 |
| mean_speed_difference | 107.0 | 0.406715 | 0.154904 | 0.086809 | 0.308490 | 0.368687 | 0.479630 | 0.915065 |
| mean_walking_direction_difference | 107.0 | 60.972269 | 18.422779 | 14.856844 | 49.219162 | 61.661211 | 72.246630 | 120.316045 |
| mean_pace_asymmetry | 107.0 | 0.443048 | 0.123088 | 0.173722 | 0.358705 | 0.436978 | 0.505316 | 0.804450 |
| turn_duration | 107.0 | 11.721495 | 6.440600 | 5.000000 | 7.000000 | 11.000000 | 13.800000 | 46.200000 |
In [ ]:
# Horizontal box plot showing how turn durations are distributed.
duration_ax = sns.boxplot(x=df['turn_duration'])
duration_ax.set_title('Box plot of turn duration')
plt.show()
In [ ]:
# Normalize each DTW distance by the turn length in samples
# (turn_duration / 0.2 undoes the 0.2 factor used when turn_duration was built).
turn_length = df['turn_duration'] / 0.2
for dtw_column in ('walking_direction_dtw', 'speeds_dtw'):
    df[f'normalized_{dtw_column}'] = df[dtw_column] / turn_length
In [ ]:
# Magnitude of each lag, ignoring its sign.
for lag_column in ('walking_direction_lag', 'speeds_lag'):
    df[f'abs_{lag_column}'] = df[lag_column].abs()
In [ ]:
# Columns of `df` used by the correlation matrices and per-feature plots below.
# Entries that are commented out are excluded from the analysis.
relevant_features = [
# general turn descriptors
'turn_duration',
'mean_distance',
'mean_pace_asymmetry',
# walking-direction metrics
'walking_direction_lag',
'abs_walking_direction_lag',
'walking_direction_dtw',
'normalized_walking_direction_dtw',
# 'walking_direction_base_corr',
'walking_direction_lagged_corr',
# 'mean_walking_direction_difference',
# speed metrics
'speeds_lag',
'abs_speeds_lag',
'speeds_dtw',
'normalized_speeds_dtw',
# 'speeds_base_corr',
'speeds_lagged_corr',
# 'mean_speed_difference',
]
In [ ]:
# Pearson correlation between every pair of selected metrics; cells with
# |r| < 0.3 are masked so only the stronger relationships are drawn.
corr = df[relevant_features].corr(method='pearson', numeric_only=True)
mask = corr.abs() < 0.3
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt=".2f", mask=mask, ax=ax)
ax.set_title(f"Metrics Correlation Matrix - Original Data (n={len(df)})")
plt.show()
In [ ]:
# Flag turns that overlap another turn of the same participant and path.
# Two turns overlap when their [start_idx, end_idx] intervals intersect (bounds
# inclusive). The symmetric test below also catches a turn that lies entirely
# inside a longer one, which an endpoint-in-range check on the current row misses.
turn_starts = df['start_idx'].to_numpy()
turn_ends = df['end_idx'].to_numpy()
overlap_flags = np.zeros(len(df), dtype=bool)
# `.indices` maps each (participant_id, path_num) group to positional row numbers,
# so this works regardless of the frame's index labels.
for positions in df.groupby(['participant_id', 'path_num']).indices.values():
    group_starts = turn_starts[positions]
    group_ends = turn_ends[positions]
    intersects = (group_starts[:, None] <= group_ends[None, :]) & \
                 (group_starts[None, :] <= group_ends[:, None])
    np.fill_diagonal(intersects, False)  # a turn does not overlap itself
    overlap_flags[positions] = intersects.any(axis=1)
df['overlapping'] = overlap_flags

# NOTE(review): the overlap filter is currently disabled, so the matrix below is
# computed on every row even though its title says "Overlapping Data".
# overlapping_and_not_subset = df[df['overlapping'] == True]
overlapping_and_not_subset = df
oans = overlapping_and_not_subset
corr_oans = oans[relevant_features].corr(method='pearson', numeric_only=True)
mask = np.abs(corr_oans) < 0.3  # hide weak correlations
plt.figure(figsize=(12, 10))
sns.heatmap(corr_oans, annot=True, fmt=".2f", mask=mask)
plt.title(f"Metrics Correlation Matrix - Overlapping Data (n={len(oans)})")
plt.show()
In [ ]:
# Keep only turns whose lagged correlation exceeds the threshold for both
# walking direction and speed, then redraw the correlation matrix on that subset.
threshold = 0.3
direction_ok = df['walking_direction_lagged_corr'].gt(threshold)
speed_ok = df['speeds_lagged_corr'].gt(threshold)
filtered_df = df[direction_ok & speed_ok]
# The overlap restriction is disabled here; the whole filtered frame is used.
# filtered_oans = filtered_df[filtered_df['overlapping'] == True]
filtered_oans = filtered_df
corr_filtered_oans = filtered_oans[relevant_features].corr(method='pearson', numeric_only=True)
mask = corr_filtered_oans.abs() < 0.3  # hide weak correlations in the heatmap
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_filtered_oans, annot=True, fmt=".2f", mask=mask, ax=ax)
ax.set_title(f"Metrics Correlation Matrix - Filtered Overlapping Data (n={len(filtered_oans)})")
plt.show()
In [ ]:
# Summary statistics of the filtered turns, one row per column.
filtered_oans.describe().transpose()
Out[ ]:
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| participant_id | 84.0 | 1862.857143 | 597.721379 | 407.000000 | 2102.000000 | 2105.000000 | 2107.000000 | 2111.000000 |
| path_num | 84.0 | 2.190476 | 0.752125 | 1.000000 | 2.000000 | 2.000000 | 3.000000 | 3.000000 |
| turn_num | 84.0 | 2.416667 | 1.592212 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 8.000000 |
| start_idx | 84.0 | 501.000000 | 382.805056 | 57.000000 | 230.250000 | 403.000000 | 658.000000 | 1720.000000 |
| end_idx | 84.0 | 557.083333 | 379.685070 | 83.000000 | 292.250000 | 456.000000 | 700.250000 | 1800.000000 |
| walking_direction_lag | 84.0 | -5.535714 | 22.021045 | -79.000000 | -14.750000 | 0.000000 | 7.000000 | 43.000000 |
| walking_direction_base_corr | 84.0 | 0.091925 | 0.409362 | -0.706492 | -0.261322 | 0.108055 | 0.417556 | 0.866582 |
| walking_direction_lagged_corr | 84.0 | 0.501833 | 0.145125 | 0.303473 | 0.390332 | 0.482601 | 0.599382 | 0.955078 |
| walking_direction_dtw | 84.0 | 47.606303 | 27.863207 | 4.432664 | 27.602502 | 43.471100 | 58.967774 | 162.955230 |
| speeds_lag | 84.0 | -3.761905 | 13.922868 | -48.000000 | -11.250000 | -1.000000 | 3.000000 | 32.000000 |
| speeds_base_corr | 84.0 | 0.182460 | 0.323466 | -0.663930 | -0.058960 | 0.214539 | 0.435689 | 0.839497 |
| speeds_lagged_corr | 84.0 | 0.497610 | 0.125933 | 0.301195 | 0.394321 | 0.478842 | 0.568946 | 0.839497 |
| speeds_dtw | 84.0 | 38.275962 | 16.284959 | 10.222585 | 27.005187 | 34.873133 | 47.807880 | 89.189543 |
| mean_distance | 84.0 | 2.472509 | 1.403922 | 0.471139 | 1.469126 | 2.213552 | 3.249714 | 9.579321 |
| mean_speed_difference | 84.0 | 0.391795 | 0.149112 | 0.103170 | 0.309907 | 0.359158 | 0.441056 | 0.915065 |
| mean_walking_direction_difference | 84.0 | 59.442886 | 17.176702 | 14.856844 | 47.398511 | 61.236454 | 71.112776 | 102.815835 |
| mean_pace_asymmetry | 84.0 | 0.430764 | 0.112938 | 0.209666 | 0.347402 | 0.410259 | 0.479218 | 0.778551 |
| turn_duration | 84.0 | 11.216667 | 5.224706 | 5.000000 | 7.300000 | 10.300000 | 13.600000 | 34.200000 |
| normalized_walking_direction_dtw | 84.0 | 0.851407 | 0.307541 | 0.152850 | 0.665227 | 0.814061 | 0.973838 | 1.683822 |
| normalized_speeds_dtw | 84.0 | 0.714920 | 0.198747 | 0.328462 | 0.574489 | 0.707928 | 0.812527 | 1.395960 |
| abs_walking_direction_lag | 84.0 | 15.630952 | 16.391138 | 0.000000 | 3.000000 | 10.000000 | 23.250000 | 79.000000 |
| abs_speeds_lag | 84.0 | 9.571429 | 10.744890 | 0.000000 | 2.000000 | 6.000000 | 13.000000 | 48.000000 |
In [ ]:
from scipy.stats import pearsonr

# For every feature, draw one scatter + regression panel per other feature whose
# correlation with it (on the filtered turns) has magnitude above 0.3.
for feature in relevant_features:
    # Skip pairs where one name is a prefix or suffix of the other (this also
    # excludes the feature itself), then keep only the stronger correlations.
    to_display = [
        feature2
        for feature2 in relevant_features
        if not feature.startswith(feature2) and not feature2.startswith(feature)
        and not feature.endswith(feature2) and not feature2.endswith(feature)
        and np.abs(corr_filtered_oans.loc[feature, feature2]) > 0.3
    ]
    if not to_display:
        continue

    # Grid of at most 3 columns; squeeze=False keeps `axs` 2-D for any grid size.
    n_cols = min(len(to_display), 3)
    n_rows = int(np.ceil(len(to_display) / 3))
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False)

    for i, feature2 in enumerate(to_display):
        # Fill the grid row by row: panel i goes to row i // n_cols, column i % n_cols.
        row, col = divmod(i, n_cols)
        ax = axs[row, col]

        peares = pearsonr(filtered_oans[feature], filtered_oans[feature2], alternative='two-sided')
        p_val = peares.pvalue
        CI = peares.confidence_interval(confidence_level=0.95)

        # Scatter plot
        sns.scatterplot(x=feature, y=feature2, data=filtered_oans, ax=ax)
        # Regression line
        sns.regplot(x=feature, y=feature2, data=filtered_oans, scatter=False, line_kws={'color': 'red'}, ax=ax)
        # Cast CI bounds to plain floats so the list prints as numbers, not NumPy scalar reprs.
        ax.set_title(f"compared with {feature2}\ncorr: {round(corr_filtered_oans.loc[feature, feature2], 3)}, p_val: {round(p_val,5)}, CI: {[round(float(c), 3) for c in CI]}", fontweight='bold')

    # Hide the unused panels of an incomplete last row.
    for unused_ax in axs.flat[len(to_display):]:
        unused_ax.set_visible(False)

    fig.suptitle(f"{feature}'s correlations", fontweight='bold')
    plt.tight_layout()
    plt.show()
In [ ]:
from PIL import Image
import seaborn as sns
from scipy import stats


def _show_extreme_turn(turns, feature, row_label, extreme_name):
    """Display the four saved plots of one turn, titled with its identifying values.

    Parameters
    ----------
    turns : DataFrame holding the turn rows (must contain the id columns and `feature`).
    feature : name of the column whose extreme value selected this turn.
    row_label : index label of the selected row in `turns`.
    extreme_name : text used in the figure title, e.g. "highest" or "lowest".
    """
    columns = ['participant_id', 'person_robot', 'path_num', 'turn_num', feature]
    record = turns.loc[[row_label], columns].to_dict(orient='records')[0]
    base_path = f"./turns/{record['participant_id']}/{record['person_robot']}/run_{record['path_num']}/turn_{record['turn_num']}/"

    image_names = ("paths.png", "distance.png", "walking_directions.png", "speeds.png")
    fig, axs = plt.subplots(1, len(image_names), figsize=(20, 5))
    for ax, image_name in zip(axs, image_names):
        # imshow copies the pixel data, so the file can be closed right away.
        with Image.open(base_path + image_name) as img:
            ax.imshow(img)
        ax.axis('off')

    to_print_dict = {k: round(v, 3) if isinstance(v, float) else v for k, v in record.items()}
    to_print_str = ", ".join([f"{k}: {v}" for k, v in to_print_dict.items()])
    fig.suptitle(f"{feature} - {extreme_name} value\n {to_print_str}", fontweight='bold')
    plt.tight_layout()
    plt.show()


# For each feature, show the turn with the highest value and the turn with the lowest value.
for feature in relevant_features:
    _show_extreme_turn(filtered_oans, feature, filtered_oans[feature].idxmax(), "highest")
    _show_extreme_turn(filtered_oans, feature, filtered_oans[feature].idxmin(), "lowest")
    print("\n\n")
In [ ]:
# Histogram (with KDE) of every relevant feature on the filtered turns, three panels per row.
n_cols = min(len(relevant_features), 3)
n_rows = int(np.ceil(len(relevant_features) / 3))
# squeeze=False keeps `axs` 2-D whatever the grid size.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
for i, feature in enumerate(relevant_features):
    # Fill the grid row by row: panel i goes to row i // n_cols, column i % n_cols.
    row, col = divmod(i, n_cols)
    ax = axs[row, col]
    sns.histplot(data=filtered_oans, x=feature, kde=True, ax=ax)
    ax.set_title(feature)
    ax.set_xlabel('')
    ax.set_ylabel('')
# Hide the unused panels of an incomplete last row.
for unused_ax in axs.flat[len(relevant_features):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()
In [ ]:
import scipy.stats as stats

# Normal Q-Q plot of every relevant feature on the filtered turns, three panels per row.
n_cols = min(len(relevant_features), 3)
n_rows = int(np.ceil(len(relevant_features) / 3))
# squeeze=False keeps `axs` 2-D whatever the grid size.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
for i, feature in enumerate(relevant_features):
    # Fill the grid row by row: panel i goes to row i // n_cols, column i % n_cols.
    row, col = divmod(i, n_cols)
    ax = axs[row, col]
    stats.probplot(filtered_oans[feature], dist="norm", plot=ax)
    ax.set_title(feature)
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Ordered Values')
# Hide the unused panels of an incomplete last row.
for unused_ax in axs.flat[len(relevant_features):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()
In [ ]:
# Box plot of every relevant feature on the filtered turns, three panels per row.
n_cols = min(len(relevant_features), 3)
n_rows = int(np.ceil(len(relevant_features) / 3))
# squeeze=False keeps `axs` 2-D whatever the grid size.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
for i, feature in enumerate(relevant_features):
    # Fill the grid row by row: panel i goes to row i // n_cols, column i % n_cols.
    row, col = divmod(i, n_cols)
    ax = axs[row, col]
    sns.boxplot(data=filtered_oans, y=feature, ax=ax)
    ax.set_title(feature)
    ax.set_xlabel('')
    ax.set_ylabel('')
# Hide the unused panels of an incomplete last row.
for unused_ax in axs.flat[len(relevant_features):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()